source("./scripts/startup.R")
The following packages have been unloaded:
ggforestplot, doParallel, iterators, foreach, glmmTMB, broom, tableone, arrow, magrittr, scales, lubridate, ggExtra, ggsci, pheatmap, Biostrings, GenomeInfoDb, XVector, IRanges, S4Vectors, BiocGenerics, ggrepel, data.table, ggpubr, viridis, viridisLite, gggenes, ggforce, glmnet, Matrix, cowplot, forcats, stringr, dplyr, purrr, readr, tidyr, tibble, ggplot2, tidyverse, pacman
Loading required package: pacman
Demonstrate whether a linear regression can be fit or not # INSERT LASSO DISTRIBUTION FOR LOG2 ON z transformed CONT. INPUTS
#p_sub<- p %>% left_join(mcov_samples_filtered) #to add info about coverage
lasso_xy = function(x, y, family = "gaussian") {
cv_model <- cv.glmnet(x, y, alpha = 1, family = family, nfolds=100)
plot(cv_model)
best_model <- glmnet(x, y, alpha = 1, family = family, lambda = cv_model$lambda.min)
best_model$dev.ratio
lasso_coef = coef(best_model) %>% as.matrix() %>% data.frame() %>% rownames_to_column() %>%
select(factor=rowname, coefficient=s0) %>% filter(factor!="(Intercept)") %>%
arrange(desc(coefficient))
return(lasso_coef)
}
normalize <- function(x, na.rm = TRUE) {
scaled = (x- min(x)) /(max(x)-min(x))
return()
}
Looks like it’s important to remove the HIV and transplant because of sparsity of data.
scan_factors = c('Duration','age18under','age55plus','sex','chronic_lung_disease',
'chronic_liver_disease', 'chronic_kidney_disease', 'chronic_heart_disease',
'transplant_patient', 'hiv', 'hypertension', 'diabetes', 'cancer', 'obesity',
'plasma', 'mAb', 'admitted_hospital','vaccine_status',
'vocAlpha','vocDelta','collection_month',
'surveillance','CT','median_coverage','run', 'PUI')
scale_scan_factors = function(patient_var_tmp, scan_factors) {
p_sub = patient_var_tmp %>%
select(MCoVNumber,lineage,Duration,COLLECTION_DT:last_col(),one_of(scan_factors)) %>%
unique %>% as.data.frame()
p_sub[, colnames(p_sub) %in% scan_factors]
p_sub_scaled = p_sub %>% select(one_of(scan_factors)) %>%
mutate_if(is.numeric, scale) %>% # scale will z scale it, instead of the normalize fx above which is min max
cbind(p_sub %>% select(!one_of(scan_factors)))
return(p_sub_scaled)
}
patient_var_tmp = read_feather("processing/patient_var_tmp.arrow")
scan_factors_trim = scan_factors[!scan_factors %in% c("transplant_patient", "hiv")]
p_sub_scaled = scale_scan_factors(
patient_var_tmp %>% filter(n_var < 30 & CT < 26),
scan_factors_trim)
y<-log2(p_sub_scaled$n_var+1)
x<-(p_sub_scaled[, colnames(p_sub_scaled) %in% scan_factors_trim]) %>% data.matrix()
lasso_under_30 = lasso_xy(x,y) %>% mutate(cutoff = "n_var_under_30") %>%
arrange(coefficient) %>% mutate(factor = fct_reorder(factor, coefficient))
lasso_initial_features = lasso_under_30 %>% ggplot(aes(x=factor, y = coefficient)) +
geom_bar(stat="identity") + coord_flip()
lasso_initial_features
ggsave("ggsave/lasso_initial_features.pdf", lasso_initial_features, height = 4, width = 3)
admitted_matrix = cbind(x,#[x[,"admitted_hospital"] == 1,],
n_var = normalize(y))
clinical_cor_heatmap = pheatmap(abs(cor(admitted_matrix, method = "spearman")),
color = viridis(10), border_color = NA)
ggsave("ggsave/clinical_cor_heatmap.pdf", clinical_cor_heatmap, height = 6, width = 6)
check_levels = function(x) { return(levels(x) %>% length ==2) }
numeric1 = function(x) { return(as.numeric(x)-1) }
plot_feature_n = function(p_sub_scaled) {
sum_n = nrow(p_sub_scaled)
feature_stats_IP = p_sub_scaled %>% select(one_of(scan_factors)) %>%
filter(admitted_hospital == 1) %>%
select_if(is.factor) %>% select_if(check_levels) %>%
mutate_if(is.factor, numeric1) %>% colSums() %>%
data.frame(feature = names(.), IP = .)
sum_n_IP = feature_stats_IP["admitted_hospital","IP"]
feature_stats_OP = p_sub_scaled %>% select(one_of(scan_factors)) %>%
filter(admitted_hospital == 0) %>%
select_if(is.factor) %>% select_if(check_levels) %>%
mutate_if(is.factor, numeric1) %>% colSums() %>%
data.frame(feature = names(.), OP = .)
sum_n_OP = sum_n - sum_n_IP
feature_stats = left_join(feature_stats_IP, feature_stats_OP)
slope = sum_n_OP/sum_n_IP
feature_n_plot = feature_stats %>% ggplot(aes(x = IP, y = OP, label = feature)) +
geom_abline(slope = slope, color = "red", linetype = "dashed") +
geom_point() + geom_text_repel() +
scale_y_continuous( breaks = seq(0,1500,100), limits = c(0,1400),
sec.axis = sec_axis(~ . / sum_n_OP, labels = scales::label_percent(),
breaks = seq(0,1,0.1))) +
scale_x_continuous( breaks = seq(0,2000,100), guide = guide_axis(angle = 90),
sec.axis = sec_axis(~ . / sum_n_IP, labels = scales::label_percent(),
breaks = seq(0,1,0.1))) + theme_pubr() +
xlab(paste0("Inpatient (n & % of ", sum_n_IP, ")")) +
ylab(paste0("Outpatient (n & % of ", sum_n_OP, ")"))
return(feature_n_plot)
}
undated_feature_n_plot = plot_feature_n(p_sub_scaled)
Joining, by = "feature"
julyonwards_feature_n_plot = plot_feature_n(p_sub_scaled %>%
filter(collection_date>="2021-07-01"))
Joining, by = "feature"
feature_n_plot = ggarrange(undated_feature_n_plot + ggtitle("12/2020 - 11/2021"),
julyonwards_feature_n_plot + ggtitle("07/2021 - 11/2021"), align = "h",
labels = list("A","B"))
feature_n_plot
Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggsave("ggsave/feature_n_plot.pdf", feature_n_plot, width = 16, height = 8)
EXPLORATION OF THE ADMITTED
patient_counts_30 = read_feather("processing/patient_counts_30.arrow")
patient_counts_30 = patient_counts_30 %>% #filter(collection_date >="2021-07-01") %>%
mutate(admitted_hospital = as.factor(admitted_hospital))
ct_date_plot = function(patient_counts_30, admitted_hospital = "admitted_hospital") {
plot_admitted = patient_counts_30 %>%
ggplot(aes(x = INSTRUMENT_RESULT, y = n_var, color = !!sym(admitted_hospital))) +
geom_point(shape = "") +
geom_density_2d(aes(color = !!sym(admitted_hospital))) +
stat_cor(method = "spearman", cor.coef.name = "rho") +
geom_smooth(se=TRUE) + theme_pubr() + scale_y_continuous(trans="log1p",
breaks = c(0,1,2,5,10,15,20,25,30)) +
scale_x_continuous(limits = c(10,26))
plot_admitted_marginal_by_ct = ggMarginal(plot_admitted, type = "violin",
draw_quantiles = c(0,0.25,0.5, 0.75),
groupColour = TRUE, groupFill = TRUE)
# By Date now:
plot_admitted_date = patient_counts_30 %>% group_by(admitted_hospital) %>%
ggplot(aes(x = collection_date, y = n_var, color = !!sym(admitted_hospital))) +
geom_point(shape = "") +
geom_density_2d(aes(color = !!sym(admitted_hospital))) +
stat_cor(method = "spearman", cor.coef.name = "rho") +
geom_smooth(se = TRUE) + theme_pubr() + scale_y_continuous(trans="log1p",
breaks = c(0,1,2,5,10,15,20,25,30))
plot_admitted_date_marginal = ggMarginal(plot_admitted_date, groupColour = TRUE, groupFill = TRUE)
plot_admitted_marginal_combined = ggarrange(plot_admitted_marginal_by_ct,
plot_admitted_date_marginal, align = "h")
}
plot_admitted_marginal_combined = ct_date_plot(patient_counts_30, "admitted_hospital")
Warning: Removed 12 rows containing non-finite values (`stat_density2d()`).
Warning: Removed 12 rows containing non-finite values (`stat_cor()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 12 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12 rows containing non-finite values (`stat_density2d()`).
Warning: Removed 12 rows containing non-finite values (`stat_cor()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 12 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12 rows containing missing values (`geom_point()`).
Warning: `position_dodge()` requires non-overlapping x intervals
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_admitted_marginal_combined
ggsave("ggsave/plot_admitted_marginal_combined.pdf",
plot_admitted_marginal_combined, height = 4.5, width = 9)
OTHER FACTORS
# VACCINATION GROUPS HISTOGRAM
vaccine_by_date_hist = patient_counts_30 %>% select(collection_month, vaccine_status, admitted_hospital) %>%
filter(vaccine_status == 1) %>% ggplot(aes(x = collection_month)) +
geom_histogram(stat="count") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
plot.background = element_rect(colour = "black", fill=NA, linewidth =2)) +
ylab("vaccine count")
Warning in geom_histogram(stat = "count"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
vaccine_by_date_hist
ggsave("ggsave/vaccine_by_date_hist.pdf", vaccine_by_date_hist, height = 2, width = 2)
Warning: 'mode(bg)' differs between new and previous
==> NOT changing 'bg'
# ADMITTED GROUPS HISTOGRAM
patient_counts_30 %>% select(collection_month, vaccine_status, admitted_hospital) %>%
filter(admitted_hospital == 1) %>% ggplot(aes(x = collection_month)) +
geom_histogram(stat="count")
Warning in geom_histogram(stat = "count"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
# Load in the TOTAL HISTOGRAM
lineages_histogram = readRDS("ggsave/lineages_figure.rds")
# combine the histograms
n_vax_admit = patient_counts_30 %>% select(COLLECTION_DT, vaccine_status, admitted_hospital) %>%
mutate(month = floor_date(COLLECTION_DT, "month")) %>% group_by(month) %>%
summarize(vaccinated = sum(as.numeric(as.character(vaccine_status))),
hospitalized = sum(as.numeric(as.character(admitted_hospital)))) %>%
pivot_longer(!month, names_to = "trends", values_to = "count")
lineages_histogram_vax_hosp = lineages_histogram +
geom_line(data = n_vax_admit, aes(month, count, color = trends, linetype = trends)) +
scale_color_manual(values = c("grey10", "black")) + theme(legend.position = "right")
# save the output
ggsave("ggsave/lineages_histogram_vax_hosp.pdf", lineages_histogram_vax_hosp, height = 4, width = 4)
# CT VACCINATED
plot_vaccinated_marginal_by_ct = ct_date_plot(patient_counts_30 %>%
filter(collection_date >="2021-07-01" & admitted_hospital == 1), "vaccine_status")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: `position_dodge()` requires non-overlapping x intervals
`position_dodge()` requires non-overlapping x intervals
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plot_vaccinated_marginal_by_ct
ggsave("ggsave/plot_vaccinated_marginal_by_ct.pdf", plot_vaccinated_marginal_by_ct, height = 5, width = 5)
# cancer
plot_cancer_marginal_by_ct = ct_date_plot(patient_counts_30 %>%
filter(collection_date <="2021-07-01"), "cancer")
Warning: Removed 11 rows containing non-finite values (`stat_density2d()`).
Warning: Removed 11 rows containing non-finite values (`stat_cor()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 11 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 11 rows containing non-finite values (`stat_density2d()`).
Warning: Removed 11 rows containing non-finite values (`stat_cor()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 11 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 11 rows containing missing values (`geom_point()`).
Warning: `position_dodge()` requires non-overlapping x intervals
`position_dodge()` requires non-overlapping x intervals
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plot_cancer_marginal_by_ct
test = ct_date_plot(patient_counts_30, "cancer")
Warning: Removed 12 rows containing non-finite values (`stat_density2d()`).
Warning: Removed 12 rows containing non-finite values (`stat_cor()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 12 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12 rows containing non-finite values (`stat_density2d()`).
Warning: Removed 12 rows containing non-finite values (`stat_cor()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 12 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12 rows containing missing values (`geom_point()`).
Warning: `position_dodge()` requires non-overlapping x intervals
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
test
# VACCINATION GROUPS WITH CONTROL OVER THE TIME & OUTPATIENT STATUS
plot_vaccinated = patient_counts_30 %>%
filter(collection_date>="2021-07-01") %>%
filter(admitted_hospital == 1) %>%
group_by(mAb) %>%
ggplot(aes(x = INSTRUMENT_RESULT, y = n_var, color = mAb)) +
geom_point(shape = "") +
geom_density_2d(aes(color = mAb)) + stat_cor(method = "spearman", cor.coef.name = "rho") +
geom_smooth(se=TRUE) + theme_pubr() + scale_y_continuous(trans="log1p",
breaks = c(0,1,2,5,10,15,20,25,30)) +
scale_x_continuous(limits = c(10,26))
plot_vaccinated_marginal_by_ct = ggMarginal(plot_vaccinated, type = "boxplot",
draw_quantiles = c(0,0.25,0.5, 0.75),
groupColour = TRUE, groupFill = TRUE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning in (function (mapping = NULL, data = NULL, stat = "boxplot", position = "dodge2", : Ignoring unknown parameters: `draw_quantiles`
Ignoring unknown parameters: `draw_quantiles`
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plot_vaccinated_marginal_by_ct
#ggsave("ggsave/plot_mAb_marginal_by_ct.pdf", plot_vaccinated_marginal_by_ct, height = 5, width = 5)
# HCW GROUPS WITH CONTROL OVER THE TIME & OUTPATIENT STATUS
table(patient_counts_30 %>% select(admitted_hospital,surveillance))
surveillance
admitted_hospital 0 1
0 2975 682
1 2048 20
plot_hcw= patient_counts_30 %>% filter(admitted_hospital == 0) %>% filter(collection_date>="2021-07-01") %>%
group_by(surveillance) %>%
ggplot(aes(x = INSTRUMENT_RESULT, y = n_var, color = vaccine_status)) +
geom_point(shape = "") +
geom_density_2d(aes(color = vaccine_status)) +
stat_cor(method = "spearman", cor.coef.name = "rho") +
geom_smooth(se=TRUE) + theme_pubr() + scale_y_continuous(trans="log1p",
breaks = c(0,1,2,5,10,15,20,25,30))
((
plot_hcw_marginal_by_ct = ggMarginal(plot_hcw, type = "violin",
draw_quantiles = c(0,0.25,0.5, 0.75),
groupColour = TRUE, groupFill = TRUE)
))
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: `position_dodge()` requires non-overlapping x intervals